######################
##### PREPARATION ####
######################
rm(list=setdiff(ls(), "path_to"))

# Load data
s = read_sav(path_to("robustness_beliefs_raw"))
benchmark = read.csv2(path_to("demographics_benchmark"), as.is=T)

# Drop respondents: illegal or didn't reach main part
s = s[!s$PROLIFIC_PID %in% c("", NA, "6136f25a4906e4c2da32bc2b"),]
s = s[!duplicated(s$PROLIFIC_PID), ]
s = s[!is.na(s$education), ]

# Drop columns
drop = c(
  colnames(s)[grepl("time", colnames(s))],
  colnames(s)[grepl("attention_", colnames(s))],
  colnames(s)[grepl("meta", colnames(s))],
  setdiff(colnames(s)[grepl("^vig_", colnames(s))], "vig_reduction"),
  colnames(s)[grepl("^conseq_", colnames(s))],
  "consent", "STUDY_ID", "SESSION_ID", "b", "hidePrev"
)
s[, drop] = NULL


##############################################################
########## DEMOGRAPHICS ######################################
##############################################################
# Education
s$education_college = 1*(s$education %in% c(5,6))
s$education_nocollege = 1*(!s$education %in% c(5,6))

# Income
s$income = plyr::mapvalues(
  s$income,
  from = c(1:8),
  to = c(7500, 20000, 37500, 62500, 87500, 125000, 175000, 225000)
)
s$income_log = log(s$income)
s$income_below50 = 1*(s$income<50000)
s$income_50_100 =  1*(s$income>50000 & s$income<=100000)
s$income_100plus = 1*(s$income>100000)

# Region
states = read.csv2(path_to("states_encoding"), as.is=T)
s$region = as.character(plyr::mapvalues(as.numeric(s$state), from = states$code, to = states$region))
s$region_midwest = s$region == "midwest"
s$region_northeast = s$region == "northeast"
s$region_south = s$region == "south"
s$region_west = s$region == "west"

# Gender
s$gender_female = 1*(s$gender %in% 2)
s$gender_other = 1*(!s$gender %in% 2)

# Age
s$age_below35 = 1*(s$age < 35)
s$age_35_54 = 1*(s$age >= 35 & s$age < 55)
s$age_55plus = 1*(s$age >= 55)

# Politics
s$politics_democrat = 1*(s$politics == 2)
s$politics_independent = 1*(s$politics == 3)
s$politics_republican = 1*(s$politics == 1)

# Ethnicity
races = c("white", "black", "hispanic", "indigenous", "asian")
for (r in 1:length(races)) {
  s[, paste0("race", races[r], "_yes")] = 1*(!is.na(s[, paste0("race_", r)]))
  s[, paste0("race", races[r], "_no")] = 1*(is.na(s[, paste0("race_", r)]))
}
s$race_indigenous = NULL


##############################################################
########## DROP INCOMPLETE ###################################
##############################################################
# Drop respondents who did not complete the survey
# Drop extreme response duration
s = s[!is.na(s$politics), ]
keep = c(
  s %>% filter(condition == "production") %>%
    filter(Duration__in_seconds_ > quantile(.$Duration__in_seconds_, c(0.01)) &
          Duration__in_seconds_ < quantile(.$Duration__in_seconds_, c(0.99))) %>%
    .[, "ResponseId", drop=T],
  s %>% filter(condition == "numeric") %>%
    filter(Duration__in_seconds_ > quantile(.$Duration__in_seconds_, c(0.01)) &
           Duration__in_seconds_ < quantile(.$Duration__in_seconds_, c(0.99))) %>%
    .[, "ResponseId", drop=T],
  s %>% filter(condition == "beliefs_incentives") %>%
    filter(Duration__in_seconds_ > quantile(.$Duration__in_seconds_, c(0.01)) &
           Duration__in_seconds_ < quantile(.$Duration__in_seconds_, c(0.99))) %>%
    .[, "ResponseId", drop=T]
)
s = s[s$ResponseId %in% keep,]


##############################################################
########## CLEAN BELIEF DATA #################################
##############################################################
## QUAL BELIEFS
s$effect = ifelse(s$condition == "production", s$effect_prod,
           ifelse(s$condition == "beliefs_incentives", s$effect_inc_fuel,
           ifelse(s$condition == "numeric",
              ifelse(s$effect_num < 0, 5,
              ifelse(s$effect_num == 0, 4,
              ifelse(s$effect_num > 0 & s$effect_num < 200, 3,
              ifelse(s$effect_num == 200, 2,
              ifelse(s$effect_num > 200, 1, NA))))), NA
            )))

# Indicator for dampening
s$belief_dampening = 1*(s$effect %in% c(3, 4))

## EXPLANATION
s$explanation = ifelse(s$condition == "production", s$explanation_prod,
                ifelse(s$condition == "beliefs_incentives", s$explanation_inc_fuel,
                ifelse(s$condition == "numeric", s$explanation_num, NA)))


##############################################################
########## CONSUMPTION DATA ##################################
##############################################################
# Relevant consumers for respective belief case
s$consumer_vig = 1*ifelse(s$vig == "meat", s$meat >= 1,
                   ifelse(s$vig == "fuel", s$miles >= 5000,
                   ifelse(s$vig == "flights", s$flights >= 1,
                   ifelse(s$vig == "energy", s$energybill >= 500,
                   ifelse(s$vig == "subelectricity", s$electricity >= 500,
                   ifelse(s$vig == "subhouse", s$energybill >= 500,
                   ifelse(s$vig == "subcoffee", s$coffee >= 1 & s$coffee_fairtrade_1 >= 50,
                   ifelse(s$vig == "subclothing", s$clothing_secondhand_1 >= 50, NA))))))))


##############################################################
########## WEIGHTS ###########################################
##############################################################
# Define target levels and weighting variables
target = benchmark$freq
names(target) = benchmark$variable
cats = unique(gsub("_.*", "", names(target)))
target_ls = list()
for (c in cats) {
  vars = names(target)[grepl(c, names(target))]
  target_ls[[paste0(c, "_wgt")]] = target[vars]
  s[, paste0(c, "_wgt")] = factor(sapply(1:nrow(s), function(i) {
    s[i, vars] %>% as.logical %>% which %>% vars[.]
  }))
}
  
# Rake!
rake = suppressWarnings(anesrake(
  inputter = target_ls, dataframe = as.data.frame(s), caseid = s$ResponseId,
  weightvec = rep(1, nrow(s)), cap = 5, verbose = F, type = "nolim"
))
# Note: Weighted sample is very close to target!
#summary(rake)
#hist(rake$weightvec)
#quantile(rake$weightvec, c(0, 0.01, 0.05, 0.1, 0.2, 0.8, 0.9, 0.95, 0.99, 1))

# Add to data
s$wgt = rake$weightvec[s$ResponseId]


##############################################################
########## SAVE DATA #########################################
##############################################################
saveRDS(s, path_to("robustness_beliefs"))

